library(rvest)
## Warning: package 'rvest' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 4.3.3
#URL scraping
# Wikipedia page that lists the current MLB ballparks
url <- "https://en.wikipedia.org/wiki/List_of_current_Major_League_Baseball_stadiums"

# Download and parse the page
webpage <- read_html(url)

# Take the first 'wikitable' on the page and convert it to a data frame.
# The first table is assumed to be the stadium list; adjust the index if the
# page layout changes.
mlb_stadiums_table <- html_table(
  html_nodes(webpage, "table.wikitable")[[1]],
  fill = TRUE
)


#' Fetch the MLB standings table for one season
#'
#' Downloads the mlb.com standings page for `year` and converts the first
#' element whose class list contains "fZSGfm" into a data frame.
#'
#' @param year Season to fetch; it is pasted onto the standings URL.
#' @return The first matching table, as produced by `html_table()`.
#'   Signals an error when no element matches the selector.
get_team_records <- function(year) {
  url <- paste0("https://www.mlb.com/standings/mlb/", year)

  # Read HTML from MLB page
  standings_page <- read_html(url)

  # NOTE(review): "fZSGfm" looks like a generated CSS class name and may change
  # when the site is redeployed -- confirm the selector before relying on it.
  standings_table <- standings_page %>%
    html_nodes(xpath = '//*[contains(concat(" ", @class, " "), " fZSGfm ")]') %>%
    html_table()

  # Fail with a message that names the season and URL instead of the bare
  # "subscript out of bounds" that indexing an empty list would raise.
  if (length(standings_table) == 0) {
    stop(
      "No standings table found for year ", year, " at ", url,
      call. = FALSE
    )
  }

  standings_table[[1]]
}

# Clean 'mlb_stadiums_table' team names: drop everything that follows the last
# "s" or "x" in each name (presumably footnote markers picked up from the
# scraped table -- confirm against the page). Names that already end in "s" or
# "x" are left untouched because the pattern needs at least one trailing
# non-s/x character to match.
#
# This single vectorised replacement does the work of the earlier two passes
# (a lazy-regex replace followed by a per-row character-trimming loop) and does
# not leave the table in a rowwise state.
mlb_stadiums_table <- mlb_stadiums_table %>%
  mutate(Team = str_replace(Team, "[^sx]+$", ""))

# Convert 'Opened' to numeric and then pin the year for specific teams.
# Cells that are not plain numbers become NA during the conversion (R reports
# "NAs introduced by coercion"); the overrides below are applied afterwards.
mlb_stadiums_table$Opened <- local({
  opened_year <- as.numeric(as.character(mlb_stadiums_table$Opened))

  # Replace 'Opened' year for specific teams
  opened_overrides <- c(
    "Oakland Athletics" = 1997,
    "Tampa Bay Rays" = 1990,
    "Chicago Cubs" = 1914
  )
  for (team_name in names(opened_overrides)) {
    opened_year[mlb_stadiums_table$Team == team_name] <-
      opened_overrides[[team_name]]
  }

  opened_year
})
# Normalise stadium names. The patterns are applied in order; the last one
# removes a trailing "‡" marker from any remaining name.
mlb_stadiums_table$Name <- local({
  stadium_names <- mlb_stadiums_table$Name
  name_fixes <- c(
    "Oakland–Alameda County Coliseum" = "Oakland Coliseum",
    "Tropicana Field†" = "Tropicana Field",
    "LoanDepot Park" = "loanDepot Park",
    "‡$" = ""
  )
  for (pattern in names(name_fixes)) {
    stadium_names <- gsub(pattern, name_fixes[[pattern]], stadium_names)
  }
  stadium_names
})

# Rename the center-field distance column so the unit is part of the title
names(mlb_stadiums_table)[
  names(mlb_stadiums_table) == "Distance to center field"
] <- "Distance to center field (ft)"

# Keep only the text before the first space in each cell of that column
mlb_stadiums_table$`Distance to center field (ft)` <- sub(
  " .*",
  "",
  mlb_stadiums_table$`Distance to center field (ft)`
)

library(rvest)
library(dplyr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(purrr)
library(stringr)
library(rvest)
library(dplyr)
library(purrr)

# Initialize empty df
# tibble() is used here because data_frame() has been deprecated since
# tibble 1.1.0 and emits a lifecycle warning.
all_standings <- tibble(Year = integer(), Standings = list())
#' Return the standings table for a given year
#'
#' Reads the mlb.com standings page for `year` and parses every element whose
#' class list contains "fZSGfm".
#'
#' @param year Season to fetch; it is pasted onto the standings URL.
#' @return The first parsed table, or `NULL` when nothing matched.
get_yearly_standings <- function(year) {
  season_url <- paste0("https://www.mlb.com/standings/mlb/", year)
  season_page <- read_html(season_url)

  class_xpath <- '//*[contains(concat(" ", @class, " "), " fZSGfm ")]'
  parsed_tables <- html_table(
    html_nodes(season_page, xpath = class_xpath),
    fill = TRUE
  )

  # Guard clause: no element carried the expected class
  if (length(parsed_tables) == 0) {
    return(NULL)
  }

  parsed_tables[[1]]
}

years <- 1974:2023

# Build one row per season: the year plus a list-column cell holding that
# season's standings table. A season that fails to download, or that yields no
# table, gets NA in the list-column instead.
all_standings <- map_df(years, function(season) {
  fetched <- tryCatch(
    {
      # respect the server
      Sys.sleep(2)
      get_yearly_standings(season)
    },
    error = function(e) {
      message("Error retrieving data for year ", season)
      NULL
    }
  )

  cell <- if (is.null(fetched)) NA else fetched
  tibble(Year = season, Standings = list(cell))
})

# Remove a trailing "w", "y" or "z" from every TEAM value in each season's
# standings table (presumably status markers appended to the team name on the
# standings page -- confirm against the site).
#
# Only the 'Standings' list-column is touched. lapply() also copes with an
# empty table, where the previous `1:length(...)` index loops would have
# produced the sequence c(1, 0) and failed on the first subscript.
all_standings$Standings <- lapply(all_standings$Standings, function(season_df) {
  # Cells for seasons that failed to download hold NA, not a data frame
  if (is.data.frame(season_df) && "TEAM" %in% colnames(season_df)) {
    season_df$TEAM <- gsub("[wyz]$", "", season_df$TEAM)
  }
  season_df
})

columns_to_keep <- c("TEAM", "HOME", "AWAY")  # Add columns you want to keep

# Reduce every season's standings table to the columns listed above.
# Only the 'Standings' list-column is touched; cells that are not data frames
# (NA placeholders for seasons that failed to download) pass through unchanged.
# lapply() is also safe on an empty table, unlike the previous `1:length(...)`
# index loops.
all_standings$Standings <- lapply(all_standings$Standings, function(season_df) {
  if (is.data.frame(season_df)) {
    # drop = FALSE keeps the result a data frame even for a single column
    season_df <- season_df[, columns_to_keep, drop = FALSE]
  }
  season_df
})

library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
## 
##     guess_encoding
# Drop the 2020 and 2021 seasons; later lookups for those years find no row
all_standings <- filter(all_standings, !(Year %in% c(2020, 2021)))

#' Parse win-loss record strings
#'
#' Splits each record at the first "-" and converts both halves to numbers.
#'
#' @param record Character vector of records such as "45-36".
#' @return A list with numeric elements `Wins` and `Losses`, each holding one
#'   value per element of `record`.
parse_record <- function(record) {
  # simplify = TRUE returns a character matrix: one row per record, and
  # n = 2 gives exactly two columns (wins, losses).
  parts <- str_split(record, "-", n = 2, simplify = TRUE)

  # Index by column. Linear indexing (parts[1], parts[2]) is only right for a
  # single record; with several records parts[2] is the second record's wins.
  list(
    Wins = parse_number(parts[, 1]),
    Losses = parse_number(parts[, 2])
  )
}

# Create new df for home record and win percentages.
#
# For every team in the stadium table, sum the HOME record over each season
# from the year its ballpark opened (but no earlier than 1974, the first
# scraped season) through 2023.
#
# NOTE: teams are matched on the name in the stadium table, so seasons in which
# the standings list the franchise under a different name produce a
# "not found" message and are left out of that team's totals.
home_win_df <- tibble(Team = character(), HomeRecord = character(), HomeWinPct = numeric())

# Collect one result row per team and bind once at the end, rather than
# growing the tibble with rbind() on every pass.
home_rows <- vector("list", length(mlb_stadiums_table$Team))

for (team_idx in seq_along(mlb_stadiums_table$Team)) {
  team <- mlb_stadiums_table$Team[[team_idx]]
  # Compare lower-cased names for an exact, case-insensitive match. The team
  # name is no longer pasted into a regular expression, where characters such
  # as "." would act as wildcards.
  team_key <- tolower(team)

  team_opened_year <- max(mlb_stadiums_table$Opened[mlb_stadiums_table$Team == team], 1974)
  team_wins <- 0
  team_losses <- 0

  # Loop through each year from opened to 2023 and collect the home wins and losses
  for (year in seq(team_opened_year, 2023)) {
    # Look the season up directly; a missing season yields NULL instead of
    # relying on a caught indexing error.
    season_idx <- match(year, all_standings$Year)
    standings_df <- if (is.na(season_idx)) {
      NULL
    } else {
      all_standings$Standings[[season_idx]]
    }

    if (!is.null(standings_df) && "TEAM" %in% colnames(standings_df)) {
      # Find the team's home record for this year
      team_record <- standings_df %>%
        filter(tolower(TEAM) == team_key) %>%
        pull(HOME)

      if (length(team_record) == 1) {
        # Parse the home record and accumulate the wins and losses
        record_parts <- parse_record(team_record)
        team_wins <- team_wins + record_parts$Wins
        team_losses <- team_losses + record_parts$Losses
      } else {
        message(paste("Home record for team", team, "in year", year, "not found or is ambiguous."))
      }
    } else {
      message(paste("Standings dataframe for year", year, "is NULL or does not have the expected columns."))
    }
  }

  # Calculate win %; NA_real_ keeps the column numeric when no games were found
  total_games <- team_wins + team_losses
  team_win_pct <- if (total_games > 0) team_wins / total_games else NA_real_

  # Format record and %
  home_record_str <- paste(team_wins, team_losses, sep = "-")
  home_win_pct <- round(team_win_pct, 3)

  home_rows[[team_idx]] <- tibble(
    Team = team,
    HomeRecord = home_record_str,
    HomeWinPct = home_win_pct
  )
}

# Binding onto the empty prototype keeps the three columns even with no teams
home_win_df <- bind_rows(home_win_df, home_rows)
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Los Angeles Angels in year 1974 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1975 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1976 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1977 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1978 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1979 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1980 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1981 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1982 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1983 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1984 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1985 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1986 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1987 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1988 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1989 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1990 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1991 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1992 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1993 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1994 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1995 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1996 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1997 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1998 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 1999 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2000 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2001 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2002 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2003 not found or is ambiguous.
## Home record for team Los Angeles Angels in year 2004 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Los Angeles Dodgers in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Kansas City Royals in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Cleveland Guardians in year 1994 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1995 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1996 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1997 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1998 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 1999 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2000 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2001 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2002 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2003 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2004 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2005 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2006 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2007 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2008 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2009 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2010 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2011 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2012 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2013 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2014 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2015 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2016 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2017 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2018 not found or is ambiguous.
## Home record for team Cleveland Guardians in year 2019 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Home record for team Tampa Bay Rays in year 1990 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1991 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1992 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1993 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1994 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1995 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1996 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1997 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1998 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 1999 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2000 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2001 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2002 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2003 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2004 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2005 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2006 not found or is ambiguous.
## Home record for team Tampa Bay Rays in year 2007 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
# Create new df for away record and win percentages.
#
# For every team in the stadium table, sum the AWAY record over each season
# from the year its ballpark opened (but no earlier than 1974, the first
# scraped season) through 2023.
#
# NOTE: teams are matched on the name in the stadium table, so seasons in which
# the standings list the franchise under a different name produce a
# "not found" message and are left out of that team's totals.
away_win_df <- tibble(Team = character(), AwayRecord = character(), AwayWinPct = numeric())

# Collect one result row per team and bind once at the end, rather than
# growing the tibble with rbind() on every pass.
away_rows <- vector("list", length(mlb_stadiums_table$Team))

for (team_idx in seq_along(mlb_stadiums_table$Team)) {
  team <- mlb_stadiums_table$Team[[team_idx]]
  # Compare lower-cased names for an exact, case-insensitive match. The team
  # name is no longer pasted into a regular expression, where characters such
  # as "." would act as wildcards.
  team_key <- tolower(team)

  team_opened_year <- max(mlb_stadiums_table$Opened[mlb_stadiums_table$Team == team], 1974)
  team_wins <- 0
  team_losses <- 0

  # Loop through each year from opened to 2023 and collect the away wins and losses
  for (year in seq(team_opened_year, 2023)) {
    # Look the season up directly; a missing season yields NULL instead of
    # relying on a caught indexing error.
    season_idx <- match(year, all_standings$Year)
    standings_df <- if (is.na(season_idx)) {
      NULL
    } else {
      all_standings$Standings[[season_idx]]
    }

    if (!is.null(standings_df) && "TEAM" %in% colnames(standings_df)) {
      # Find the team's away record for this year
      team_record <- standings_df %>%
        filter(tolower(TEAM) == team_key) %>%
        pull(AWAY)

      if (length(team_record) == 1) {
        # Parse the away record and accumulate the wins and losses
        record_parts <- parse_record(team_record)
        team_wins <- team_wins + record_parts$Wins
        team_losses <- team_losses + record_parts$Losses
      } else {
        message(paste("Away record for team", team, "in year", year, "not found or is ambiguous."))
      }
    } else {
      message(paste("Standings dataframe for year", year, "is NULL or does not have the expected columns."))
    }
  }

  # Calculate win %; NA_real_ keeps the column numeric when no games were found
  total_games <- team_wins + team_losses
  team_win_pct <- if (total_games > 0) team_wins / total_games else NA_real_

  # Formatting
  away_record_str <- paste(team_wins, team_losses, sep = "-")
  away_win_pct <- round(team_win_pct, 3)

  away_rows[[team_idx]] <- tibble(
    Team = team,
    AwayRecord = away_record_str,
    AwayWinPct = away_win_pct
  )
}

# Binding onto the empty prototype keeps the three columns even with no teams
away_win_df <- bind_rows(away_win_df, away_rows)
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Los Angeles Angels in year 1974 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1975 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1976 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1977 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1978 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1979 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1980 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1981 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1982 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1983 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1984 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1985 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1986 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1987 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1988 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1989 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1990 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1991 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1992 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1993 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1994 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1995 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1996 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1997 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1998 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 1999 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2000 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2001 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2002 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2003 not found or is ambiguous.
## Away record for team Los Angeles Angels in year 2004 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Los Angeles Dodgers in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Kansas City Royals in year 1981 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Cleveland Guardians in year 1994 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1995 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1996 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1997 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1998 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 1999 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2000 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2001 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2002 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2003 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2004 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2005 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2006 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2007 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2008 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2009 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2010 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2011 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2012 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2013 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2014 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2015 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2016 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2017 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2018 not found or is ambiguous.
## Away record for team Cleveland Guardians in year 2019 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Away record for team Tampa Bay Rays in year 1990 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1991 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1992 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1993 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1994 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1995 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1996 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1997 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1998 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 1999 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2000 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2001 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2002 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2003 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2004 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2005 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2006 not found or is ambiguous.
## Away record for team Tampa Bay Rays in year 2007 not found or is ambiguous.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
## Standings dataframe for year 2020 is NULL or does not have the expected columns.
## Standings dataframe for year 2021 is NULL or does not have the expected columns.
# Copy the data and create a dataframe
data <- "franchise  att avg5    att avg10   att avg15   att 2023    att 2022    att 2021    att 2020    att 2019    att 2018    att 2017    att 2016    att 2015    att 2014    att 2013    att 2012    att 2011    att 2010    att 2009
Los Angeles Dodgers 2,895,498   3,335,131   3,378,547   3,837,079   3,861,408   2,804,693   0   3,974,309   3,857,500   3,765,856   3,703,312   3,764,815   3,782,337   3,743,527   3,324,246   2,935,139   3,562,320   3,761,669
St. Louis Cardinals 2,428,913   2,950,212   3,058,161   3,241,091   3,320,551   2,102,530   0   3,480,393   3,403,587   3,447,937   3,444,490   3,520,889   3,540,649   3,369,769   3,262,109   3,093,954   3,301,218   3,343,252
New York Yankees    2,333,896   2,795,813   3,061,264   3,269,016   3,136,207   1,959,854   0   3,304,404   3,482,855   3,146,966   3,063,405   3,193,795   3,401,624   3,279,589   3,542,406   3,653,680   3,765,807   3,719,358
Atlanta Braves  2,255,357   2,271,443   2,329,331   3,191,505   3,129,931   2,300,247   0   2,655,100   2,555,781   2,505,252   2,020,914   2,001,392   2,354,305   2,548,679   2,420,171   2,372,940   2,510,119   2,373,631
San Diego Padres    2,169,475   2,213,942   2,175,148   3,271,554   2,987,470   2,191,950   0   2,396,399   2,147,000   2,138,491   2,351,426   2,459,752   2,195,373   2,166,691   2,123,721   2,143,018   2,131,774   1,922,603
Houston Astros  2,133,444   2,226,348   2,162,845   3,052,347   2,688,998   2,068,509   0   2,857,367   2,980,549   2,403,671   2,306,623   2,153,585   1,751,829   1,651,883   1,607,733   2,067,016   2,331,490   2,521,076
Toronto Blue Jays   1,647,087   2,232,732   2,152,222   3,021,904   2,653,830   809,557 0   1,750,144   2,325,281   3,203,886   3,392,299   2,794,891   2,375,525   2,536,562   2,099,663   1,818,103   1,625,555   1,876,129
Boston Red Sox  1,989,434   2,455,264   2,639,475   2,672,130   2,625,089   1,725,323   0   2,924,627   2,895,575   2,917,678   2,955,434   2,880,694   2,956,089   2,833,333   3,043,003   3,054,001   3,046,445   3,062,699
Chicago Cubs    2,093,146   2,569,072   2,697,731   2,775,149   2,616,780   1,978,934   0   3,094,865   3,181,089   3,199,562   3,232,420   2,959,812   2,652,113   2,642,682   2,882,756   3,017,966   3,062,973   3,168,859
Colorado Rockies    2,027,450   2,389,642   2,518,054   2,607,935   2,597,428   1,938,645   0   2,993,244   3,015,880   2,953,650   2,602,524   2,506,789   2,680,329   2,793,828   2,630,458   2,909,777   2,875,245   2,665,080
New York Mets   1,813,098   2,125,927   2,248,685   2,573,555   2,564,737   1,484,665   0   2,442,532   2,224,995   2,460,622   2,789,602   2,569,753   2,148,808   2,135,657   2,242,803   2,378,549   2,559,738   3,154,262
San Francisco Giants    1,874,017   2,475,745   2,716,499   2,500,153   2,482,686   1,679,484   0   2,707,760   3,156,185   3,303,652   3,365,256   2,193,581   3,368,697   3,326,796   3,377,371   3,387,303   3,037,443   2,861,113
Los Angeles Angels  1,925,816   2,479,372   2,702,168   2,640,575   2,457,461   1,512,033   0   3,019,012   3,020,216   3,019,583   3,016,142   3,012,765   3,095,935   3,019,505   3,061,770   3,166,321   3,250,814   3,240,386
Milwaukee Brewers   1,944,270   2,278,551   2,468,890   2,551,317   2,422,420   1,824,282   0   2,923,333   2,850,875   2,558,722   2,314,614   2,542,558   2,797,384   2,531,105   2,831,385   3,071,373   2,776,531   3,037,451
Seattle Mariners    1,597,107   2,012,861   1,985,923   2,690,418   2,287,267   1,215,985   0   1,791,863   2,299,489   2,135,445   2,267,928   3,375,882   2,064,334   1,761,546   1,721,920   1,896,321   2,085,168   2,195,284
Philadelphia Phillies   1,914,530   1,980,621   2,487,532   3,052,605   2,276,736   1,515,890   0   2,727,421   2,158,124   1,905,354   1,915,144   1,831,080   2,423,852   3,012,403   3,565,718   3,680,718   3,647,249   3,600,693
Washington Nationals    1,523,511   2,035,331   2,064,157   1,865,832   2,026,401   1,465,543   0   2,259,781   2,529,604   2,524,980   2,481,938   2,619,843   2,579,389   2,652,422   2,370,794   1,940,478   1,828,066   1,817,280
Texas Rangers   1,757,539   2,132,357   2,371,351   2,533,062   2,011,381   2,110,258   0   2,132,994   2,107,107   2,507,760   2,710,402   2,491,875   2,718,733   3,178,273   3,460,280   2,946,949   2,505,171   2,156,016
Chicago White Sox   1,378,416   1,528,329   1,699,821   1,669,628   1,976,344   1,596,385   0   1,649,725   1,608,817   1,629,470   1,746,293   1,755,810   1,650,821   1,768,413   1,965,955   2,001,117   2,194,378   2,284,164
Minnesota Twins 1,475,921   1,782,465   2,125,776   1,974,124   1,801,128   1,310,199   0   2,294,152   1,959,197   2,051,279   1,963,912   2,220,054   2,250,606   2,477,644   2,776,354   3,168,107   3,223,640   2,416,237
Arizona Diamondbacks    1,348,980   1,731,206   1,861,041   1,961,182   1,605,199   1,043,010   0   2,135,510   2,242,695   2,134,375   2,036,216   2,080,145   2,073,730   2,134,795   2,177,617   2,105,432   2,056,519   2,129,183
Detroit Tigers  1,153,616   1,808,376   2,124,377   1,612,876   1,551,149   1,102,623   0   1,501,430   1,856,970   2,321,599   2,493,859   2,726,048   2,917,209   3,083,397   3,028,033   2,642,045   2,461,237   2,567,185
Cincinnati Reds 1,349,558   1,700,432   1,860,527   2,038,310   1,395,770   1,505,024   0   1,808,685   1,629,356   1,836,917   1,894,085   2,419,506   2,476,664   2,534,369   2,347,251   2,213,498   2,060,551   1,747,919
Baltimore Orioles   1,081,240   1,595,622   1,720,778   1,936,798   1,368,367   793,229 0   1,307,807   1,564,192   2,028,424   2,172,344   2,320,590   2,464,473   2,357,561   2,102,240   1,755,461   1,733,018   1,907,163
Cleveland Guardians 1,196,589   1,437,575   1,503,400   1,834,068   1,295,869   1,114,368   0   1,738,642   1,926,701   2,048,138   1,591,667   1,388,905   1,437,393   1,572,926   1,603,596   1,840,835   1,391,644   1,766,242
Kansas City Royals  1,044,862   1,633,253   1,664,054   1,307,052   1,277,986   1,159,613   0   1,479,659   1,665,107   2,220,370   2,557,712   2,708,549   1,956,482   1,750,754   1,739,859   1,724,450   1,615,327   1,797,887
Pittsburgh Pirates  1,047,804   1,581,396   1,686,295   1,630,624   1,257,458   859,498 0   1,491,439   1,465,316   1,919,447   2,249,021   2,498,596   2,442,564   2,256,862   2,091,918   1,940,429   1,613,399   1,577,853
Tampa Bay Rays  901,647 1,089,712   1,282,417   1,440,301   1,128,127   761,072 0   1,178,735   1,154,973   1,253,619   1,286,163   1,247,668   1,446,464   1,510,300   1,559,681   1,529,188   1,864,999   1,874,962
Miami Marlins   704,845 1,118,426   1,299,973   1,162,819   907,487 642,617 0   811,302 811,104 1,651,997   1,712,417   1,752,235   1,732,283   1,586,322   2,219,444   1,520,562   1,524,894   1,464,109
Oakland Athletics   796,779 1,232,654   1,341,255   832,352 787,902 701,430 0   1,662,211   1,573,616   1,475,721   1,521,506   1,768,175   2,003,628   1,809,302   1,679,013   1,476,792   1,418,391   1,408,783
30 record(s)"
# Parse the pasted, tab-separated attendance table held in `data` into
# `attendance_df`: one row per franchise, per-home-game attendance for every
# retained season, plus the franchise's mean over those seasons.

# Strip leading/trailing whitespace so the split yields no empty edge lines
data <- trimws(data)

# One element per line of the pasted table
lines <- strsplit(data, "\n", fixed = TRUE)[[1]]

# The first line carries the column names
column_names <- strsplit(lines[1], "\t", fixed = TRUE)[[1]]

# Keep only franchise rows. The paste ends with a summary footer
# ("30 record(s)"); drop it by content rather than by a hard-coded row number
# so the parse keeps working if the number of rows changes.
data_rows <- lines[-1]
is_footer <- grepl("^[0-9]+ record\\(s\\)$", trimws(data_rows))
data_rows <- data_rows[!is_footer & nzchar(trimws(data_rows))]

# Split every row into its fields (franchise name first, then the values)
data_lines <- strsplit(data_rows, "\t", fixed = TRUE)

# rbind() silently recycles short rows, so fail loudly on a malformed line
bad_rows <- which(lengths(data_lines) != length(column_names))
if (length(bad_rows) > 0) {
  stop(
    "Attendance rows with an unexpected number of fields: ",
    paste(bad_rows, collapse = ", "),
    call. = FALSE
  )
}

attendance_df <- as.data.frame(do.call(rbind, data_lines), stringsAsFactors = FALSE)
colnames(attendance_df) <- column_names

# Drop the 2020 and 2021 seasons and the avg5/avg10/avg15 columns
columns_to_delete <- c("att 2020", "att 2021", "att avg5", "att avg10", "att avg15")
attendance_df <- attendance_df[, !(names(attendance_df) %in% columns_to_delete), drop = FALSE]

# Convert "3,837,079"-style strings to numbers, then divide by 81 to
# represent attendance per home game
att_cols <- grep("^att", names(attendance_df), value = TRUE)
attendance_df[att_cols] <- lapply(
  attendance_df[att_cols],
  function(x) as.numeric(gsub(",", "", x, fixed = TRUE)) / 81
)

# Compute the average per-game attendance for each team across the seasons
attendance_df$average_attendance <- rowMeans(attendance_df[att_cols], na.rm = TRUE)

hitta_data <- "Rk.  Team    Venue   Year    Park Factor wOBACon xwOBACon    BACON   xBACON  HardHit R   OBP H   1B  2B  3B  HR  BB  SO  PA
1       Estadio Alfredo Harp Helu   2023    150 177 114 149 101 114 225 128 137 90  173 173 318 106 120 646
2    Rockies    Coors Field 2023    113 112 103 113 103 101 128 111 117 114 125 215 103 99  90  18,282
3    Red Sox    Fenway Park 2023    108 107 100 109 101 102 117 108 111 110 128 114 92  99  93  17,912
4       London Stadium  2023    106 110 104 116 108 116 112 109 108 112 130 0   67  82  102 599
5    Royals Kauffman Stadium    2023    106 102 104 102 103 104 112 106 106 106 109 151 95  105 87  17,889
6    Rangers    Globe Life Field    2023    106 107 102 104 100 105 112 104 105 101 103 71  133 104 97  19,412
7    Cardinals  Busch Stadium   2023    103 101 105 102 104 103 106 104 106 110 103 78  98  100 90  17,939
8    Braves Truist Park 2023    103 106 106 105 105 107 106 101 103 102 95  99  119 97  106 18,058
9    Nationals  Nationals Park  2023    103 99  101 101 101 100 106 104 108 113 100 87  98  92  82  17,543
10   Reds   Great American Ball Park    2023    102 103 97  99  98  96  104 101 98  94  89  97  127 105 102 17,755
11   Cubs   Wrigley Field   2023    101 101 100 101 101 98  102 101 101 102 94  114 105 101 101 17,628
12   Marlins    loanDepot park  2023    101 101 101 103 102 98  102 102 106 109 110 89  87  90  98  17,413
13   Angels Angel Stadium   2023    100 102 101 100 100 98  100 99  98  98  82  103 115 103 107 17,966
14   Astros Minute Maid Park    2023    100 99  98  100 98  96  100 102 101 103 101 112 92  103 96  18,949
15   Twins  Target Field    2023    100 105 102 102 99  101 100 99  98  92  110 121 107 101 112 18,168
16   White Sox  Guaranteed Rate Field   2023    99  100 99  101 101 99  98  100 100 105 88  55  98  98  104 18,157
17   D-backs    Chase Field 2023    99  98  95  100 98  100 98  99  102 103 106 209 79  98  98  19,262
18   Dodgers    Dodger Stadium  2023    99  99  100 96  99  99  98  96  95  90  99  47  122 97  101 18,242
19   Tigers Comerica Park   2023    99  99  100 100 100 101 98  100 99  101 93  141 90  103 99  17,518
20   Phillies   Citizens Bank Park  2023    99  102 101 100 100 101 98  98  97  93  100 106 114 97  106 19,048
21   Yankees    Yankee Stadium  2023    99  99  103 96  101 103 98  97  95  91  91  45  123 102 102 17,256
22   Rays   Tropicana Field 2023    98  101 98  99  98  99  96  96  96  91  105 121 103 93  111 17,740
23   Pirates    PNC Park    2023    98  96  100 99  101 101 96  101 100 103 111 91  71  101 96  17,812
24   Brewers    American Family Field   2023    98  99  100 98  100 96  96  98  95  94  91  66  107 109 109 17,825
25   Blue Jays  Rogers Centre   2023    97  97  97  97  97  97  94  98  95  93  103 55  95  106 101 17,604
26   Mets   Citi Field  2023    97  95  98  94  97  100 94  98  91  91  82  61  104 111 103 17,647
27   Orioles    Oriole Park at Camden Yards 2023    96  95  101 96  101 103 92  96  99  105 88  112 91  93  97  17,765
28   Athletics  Oakland Coliseum    2023    96  96  97  96  96  94  92  97  93  91  100 110 88  108 104 17,675
29   Padres Petco Park  2023    96  94  99  95  99  99  92  98  94  95  98  49  89  110 100 17,116
30   Guardians  Progressive Field   2023    94  92  97  95  101 96  88  97  98  101 110 105 67  96  95  17,571
31   Giants Oracle Park 2023    94  94  97  96  98  102 88  94  99  105 88  92  89  83  98  17,067
32   Mariners   T-Mobile Park   2023    93  96  97  96  98  96  86  94  92  93  91  69  94  95  113 17,578
33      Journey Bank Ballpark   2023    78  74  89  80  93  88  61  83  90  109 67  0   47  81  82  262"

# Parse the tab-separated park-factor table in `hitta_data` into a data frame.
# `text =` lets read.table() create and close its own connection; handing it
# an already-open textConnection() leaves that connection open afterwards.
hitta_frame <- read.table(
  text = hitta_data,
  header = TRUE,
  sep = "\t",
  quote = "",
  dec = ".",
  fill = TRUE,
  check.names = FALSE
)

# Drop rows 1, 4 and 33 by position (the venues that hitta_data lists
# without a team name)
rows_to_remove <- c(1, 4, 33)
hitta_frame <- hitta_frame[-rows_to_remove, ]

# Remove the columns that are not kept in hitta_frame
columns_to_delete <- c("Rk.", "Year", "wOBACon", "xwOBACon", "BACON", "xBACON")
hitta_frame <- hitta_frame[, !(names(hitta_frame) %in% columns_to_delete)]

# Hand-entered table with one row per team: rank, team nickname, home venue
# and a 2023 value. data.frame() converts the `2023` column name to the
# syntactic name `X2023`.
Xtra_distance <- data.frame(
  Rk = as.numeric(seq_len(30)),
  Team = c(
    "Rockies", "D-backs", "Rays", "Braves", "Brewers", "Phillies",
    "Astros", "Rangers", "Twins", "Dodgers", "Guardians", "Royals",
    "Pirates", "Padres", "Athletics", "Marlins", "Cardinals", "Reds",
    "Orioles", "Tigers", "White Sox", "Mariners", "Angels", "Blue Jays",
    "Nationals", "Red Sox", "Giants", "Yankees", "Cubs", "Mets"
  ),
  Venue = c(
    "Coors Field", "Chase Field", "Tropicana Field", "Truist Park",
    "American Family Field", "Citizens Bank Park", "Minute Maid Park",
    "Globe Life Field", "Target Field", "Dodger Stadium",
    "Progressive Field", "Kauffman Stadium", "PNC Park", "Petco Park",
    "Oakland Coliseum", "loanDepot Park", "Busch Stadium",
    "Great American Ball Park", "Oriole Park at Camden Yards",
    "Comerica Park", "Guaranteed Rate Field", "T-Mobile Park",
    "Angel Stadium", "Rogers Centre", "Nationals Park", "Fenway Park",
    "Oracle Park", "Yankee Stadium", "Wrigley Field", "Citi Field"
  ),
  `2023` = c(
    18.0, 9.5, 5.3, 6.6, 2.5, -0.6, -3.1, 5.2, 2.0, -3.4,
    -4.8, 5.5, -1.2, -3.9, -2.9, -1.2, -2.3, 1.8, -2.3, -1.0,
    -2.3, -1.8, -2.0, -4.1, -2.0, -5.7, -4.4, -5.7, -4.4, -2.0
  )
)

library(dplyr)
# Attach each venue's 2023 value to the stadium table as `2023_distance`,
# matching the stadium table's `Name` against `Venue`. The value lives in
# `X2023` because data.frame() made the `2023` column name syntactic.
distance_2023 <- select(Xtra_distance, Venue, X2023)
mlb_stadiums_table <- left_join(
  mlb_stadiums_table,
  distance_2023,
  by = c("Name" = "Venue")
)
mlb_stadiums_table <- rename(mlb_stadiums_table, `2023_distance` = X2023)

# Read a whitespace-delimited Home/Away split table pasted as text. The first
# non-blank line supplies the column names.
parse_split_table <- function(raw_text) {
  read.table(text = raw_text, header = TRUE)
}

# Jose Altuve: Home and Away batting lines
dataJose <- "
BREAKDOWN  AB   R   H   2B  3B  HR  RBI BB  HBP SO  SB  CS  AVG OBP SLG OPS
Home      296   47  92  16  1   9   35  24  6   52  19  3   .311    .371    .463    .834
Away      294   65  112 23  3   15  46  34  3   32  13  3   .381    .449    .633    1.081
"
JoseAltuve <- parse_split_table(dataJose)

# Alex Bregman: Home and Away batting lines
dataBregman <- "
BREAKDOWN   AB  R   H   2B  3B  HR  RBI BB  HBP SO  SB  CS  AVG OBP SLG OPS
Home       266  42  74  13  2   9   30  27  1   44  10  2   .278    .343    .444    .787
Away       290  46  84  26  3   10  41  28  6   53  7   3   .290    .360    .503    .863
"
AlexBregman <- parse_split_table(dataBregman)

# George Springer: Home and Away batting lines
dataSpringer <- "
BREAKDOWN    AB R   H   2B  3B  HR  RBI BB  HBP SO  SB  CS  AVG OBP SLG OPS
Home         271    51  74  16  0   16  42  35  6   51  2   5   .273    .366    .509    .875
Away         277    61  81  13  0   18  43  29  5   60  3   2   .292    .367    .534    .902 "
GeorgeSpringer <- parse_split_table(dataSpringer)

# Carlos Correa: Home and Away batting lines (this paste has a different,
# wider set of columns than the three tables above)
dataCorrea <- "
BREAKDOWN G AB PA H 1B 2B 3B HR R RBI BB IBB SO HBP SF SH GDP SB CS AVG
Home       51   186 212 62  38  13  0   11  31  40  23  4   31  1   2   0   6   1   0   .333
Away       58   236 269 71  45  12  1   13  51  44  30  1   61  1   2   0   6   1   1   .301"
CarlosCorrea <- parse_split_table(dataCorrea)


# Team earned-run-average table, one row per team in ascending order of the
# 2023 value. data.frame() stores the `2023` and `2022` columns under the
# syntactic names `X2023` and `X2022`.
mlb_era <- data.frame(
  Rank = seq_len(30),
  Team = c(
    "San Diego Padres", "Seattle Mariners", "Milwaukee Brewers",
    "Toronto Blue Jays", "Minnesota Twins", "Tampa Bay Rays",
    "Philadelphia Phillies", "Baltimore Orioles", "New York Yankees",
    "Cleveland Guardians", "Houston Astros", "San Francisco Giants",
    "Chicago Cubs", "Los Angeles Dodgers", "Atlanta Braves",
    "Miami Marlins", "Texas Rangers", "Detroit Tigers",
    "New York Mets", "Arizona Diamondbacks", "Boston Red Sox",
    "Pittsburgh Pirates", "Los Angeles Angels", "St. Louis Cardinals",
    "Cincinnati Reds", "Chicago White Sox", "Washington Nationals",
    "Kansas City Royals", "Oakland Athletics", "Colorado Rockies"
  ),
  `2023` = c(
    3.73, 3.74, 3.75, 3.78, 3.85, 3.89, 3.90, 3.96, 3.97, 3.97,
    3.98, 4.02, 4.08, 4.10, 4.15, 4.23, 4.24, 4.24, 4.30, 4.39,
    4.52, 4.60, 4.64, 4.81, 4.83, 4.88, 5.02, 5.17, 5.48, 5.68
  ),
  Home = c(
    3.33, 3.42, 3.69, 3.68, 3.77, 3.83, 3.54, 3.91, 3.92, 3.68,
    4.36, 3.52, 3.94, 3.58, 4.17, 4.09, 4.65, 4.47, 3.89, 4.13,
    4.73, 4.64, 4.46, 4.87, 4.90, 4.71, 5.08, 4.96, 4.96, 6.12
  ),
  Away = c(
    4.14, 4.07, 3.82, 3.87, 3.94, 3.96, 4.28, 4.01, 4.02, 4.28,
    3.59, 4.55, 4.23, 4.66, 4.13, 4.38, 3.83, 4.00, 4.73, 4.65,
    4.29, 4.56, 4.83, 4.76, 4.75, 5.06, 4.96, 5.39, 6.10, 5.20
  ),
  `2022` = c(
    3.81, 3.59, 3.83, 3.93, 3.98, 3.38, 3.90, 3.97, 3.30, 3.44,
    2.85, 3.85, 4.00, 2.82, 3.51, 3.87, 4.22, 4.04, 3.62, 4.25,
    4.53, 4.66, 3.77, 3.80, 4.86, 3.92, 5.00, 4.70, 4.52, 5.07
  )
)

# Team payroll table, one row per team in descending order of `Payroll`.
# data.frame() converts the two spaced `2024 ...` column names to
# `X2024.Payroll.Proj` and `X2024.Luxury.Tax.Proj`.
payroll_data <- data.frame(
  Team = c(
    "New York Mets", "Los Angeles Dodgers", "New York Yankees",
    "Philadelphia Phillies", "Houston Astros", "Atlanta Braves",
    "Toronto Blue Jays", "Texas Rangers", "Chicago Cubs",
    "San Francisco Giants", "Boston Red Sox", "St. Louis Cardinals",
    "Los Angeles Angels", "Arizona Diamondbacks", "San Diego Padres",
    "Chicago White Sox", "Colorado Rockies", "Seattle Mariners",
    "Washington Nationals", "Minnesota Twins", "Kansas City Royals",
    "Detroit Tigers", "Milwaukee Brewers", "Cincinnati Reds",
    "Miami Marlins", "Cleveland Guardians", "Baltimore Orioles",
    "Tampa Bay Rays", "Pittsburgh Pirates", "Oakland Athletics"
  ),
  Rank = seq_len(30),
  Payroll = c(
    322, 314, 302, 246, 241, 230, 226, 225, 223, 208,
    181, 181, 175, 169, 167, 150, 147, 139, 131, 127,
    116, 109, 109, 104, 99, 98, 97, 97, 84, 61
  ),
  `2024 Payroll Proj` = c(
    333, 320, 312, 262, 256, 273, 249, 248, 234, 253,
    220, 215, 190, 217, 226, 129, 168, 161, 142, 157,
    162, 121, 149, 122, 121, 135, 123, 134, 120, 82
  ),
  `2024 Luxury Tax Proj` = c(
    84.8, 90.8, 68.7, 87.6, 81.5, 86.6, 71.7, 88.4, 86, 82.9,
    82, 87.9, 81.3, 70.7, 81.7, 80.6, 78.5, 73.1, 62.2, 82.6,
    77, 79.4, 71.2, 72.3, 47, 38.8, 31.2, 54.2, 73.6, 57.7
  )
)
library(dplyr)
library(stringr)
# Give the payroll ranking an unambiguous column name before joining
payroll_data <- rename(payroll_data, payroll_rank = Rank)

# Add each team's payroll rank to the stadium table, matching on `Team`
payroll_ranks <- select(payroll_data, Team, payroll_rank)
mlb_stadiums_table <- left_join(mlb_stadiums_table, payroll_ranks, by = "Team")

# Park-factor figures pasted as one string per team: the team name followed
# by five space-separated decimal values.
park_data <- c(
  "Colorado Rockies 1.351 1.220 1.160 1.295 1.816",
  "Boston Red Sox 1.170 1.049 1.119 1.242 1.669",
  "Cincinnati Reds 1.133 1.406 1.016 1.003 0.666",
  "Kansas City Royals 1.112 0.888 1.123 1.211 1.259",
  "Texas Rangers 1.063 1.190 0.978 1.046 0.821",
  "Atlanta Braves 1.038 1.009 1.010 0.964 0.974",
  "Chicago White Sox 1.035 1.176 0.984 0.907 0.626",
  "Pittsburgh Pirates 1.034 0.823 1.076 1.068 1.091",
  "Los Angeles Angels 1.031 1.154 1.000 0.890 1.031",
  "Chicago Cubs 1.017 1.030 1.050 0.920 1.206",
  "Baltimore Orioles 1.013 1.087 1.081 0.877 0.842",
  "Milwaukee Brewers 1.005 1.108 0.936 0.963 1.113",
  "Miami Marlins 1.004 0.929 1.051 1.014 1.185",
  "Philadelphia Phillies 0.998 1.090 0.952 0.973 1.143",
  "Washington Nationals 0.998 1.035 0.971 1.031 0.877",
  "Arizona Diamondbacks 0.994 0.765 1.095 1.134 1.303",
  "Detroit Tigers 0.989 0.818 1.021 1.019 1.803",
  "New York Yankees 0.988 1.139 0.933 0.911 0.583",
  "Houston Astros 0.975 0.978 0.976 0.995 1.422",
  "Minnesota Twins 0.970 0.956 0.974 1.115 0.843",
  "Toronto Blue Jays 0.955 0.982 0.945 1.037 0.644",
  "Los Angeles Dodgers 0.949 1.218 0.903 0.970 0.667",
  "St. Louis Cardinals 0.941 0.892 1.028 0.936 0.749",
  "San Francisco Giants 0.933 0.807 1.026 1.001 1.301",
  "Cleveland Guardians 0.929 0.935 0.974 0.969 0.842",
  "Oakland Athletics 0.908 0.798 0.967 0.996 1.049",
  "Tampa Bay Rays 0.890 0.895 0.879 0.981 0.905",
  "Seattle Mariners 0.885 0.956 0.907 0.853 0.535",
  "New York Mets 0.873 0.900 0.933 0.828 0.636",
  "San Diego Padres 0.860 0.931 0.953 0.891 0.593"
)

# Capture group 1 is the (lazily matched) team name; groups 2-6 are the five
# decimal values that follow it
regex_pattern <- "(.*?) (\\d\\.\\d+) (\\d\\.\\d+) (\\d\\.\\d+) (\\d\\.\\d+) (\\d\\.\\d+)"
park_data_matches <- str_match(park_data, regex_pattern)

# Column 1 of the match matrix is the full match; keep only the capture
# groups and name the team column `Team` directly
Park_rank_df <- as.data.frame(park_data_matches[, -1], stringsAsFactors = FALSE)
names(Park_rank_df) <- c("Team", "Runs", "HR", "1B", "2B", "3B")

# Convert the captured strings to numbers once (across() replaces the
# superseded mutate_at()), then rank teams by `Runs`, highest first; tied
# values share the smaller rank
Park_rank_df <- Park_rank_df %>%
  mutate(
    across(c(Runs, HR, `1B`, `2B`, `3B`), as.numeric),
    Runs_rank = min_rank(desc(Runs))
  )

# Add each team's `Runs_rank` to the stadium table, matching on `Team`
mlb_stadiums_table <- mlb_stadiums_table %>%
  left_join(Park_rank_df %>% select(Team, Runs_rank), by = "Team")
# One hex colour per team, keyed by the full team name. The trailing comment
# on each entry is the label given to that colour.
mlb_colors <- list(
  "Arizona Diamondbacks" = "#A71930", # Sedona Red
  "Atlanta Braves" = "#13274F", # Braves Blue
  "Baltimore Orioles" = "#DF4601", # Orioles Orange
  "Boston Red Sox" = "#BD3039", # Red Sox Red
  "Chicago White Sox" = "#27251F", # White Sox Black
  "Chicago Cubs" = "#0E3386", # Cubs Blue
  "Cincinnati Reds" = "#C6011F", # Reds Red
  "Cleveland Guardians" = "#E31937", # Guardians Red
  "Colorado Rockies" = "#33006F", # Rockies Purple
  "Detroit Tigers" = "#0C2C56", # Tigers Navy
  "Houston Astros" = "#EB6E1F", # Astros Orange
  "Kansas City Royals" = "#004687", # Royals Blue
  "Los Angeles Angels" = "#BA0021", # Angels Red
  "Los Angeles Dodgers" = "#005A9C", # Dodgers Blue
  "Miami Marlins" = "#00A3E0", # Marlins Blue
  "Milwaukee Brewers" = "#13294B", # Brewers Navy
  "Minnesota Twins" = "#002B5C", # Twins Navy
  "New York Yankees" = "#003087", # Yankees Navy
  "New York Mets" = "#FF5910", # Mets Orange
  "Oakland Athletics" = "#003831", # Athletics Green
  "Philadelphia Phillies" = "#E81828", # Phillies Red
  "Pittsburgh Pirates" = "#FFB712", # Pirates Gold
  "San Diego Padres" = "#2F241D", # Padres Brown
  "San Francisco Giants" = "#FD5A1E", # Giants Orange
  "Seattle Mariners" = "#005C5C", # Mariners Turquoise
  "St. Louis Cardinals" = "#C41E3A", # Cardinals Red
  "Tampa Bay Rays" = "#8FBCE6", # Rays Light Blue
  "Texas Rangers" = "#C0111F", # Rangers Red
  "Toronto Blue Jays" = "#134A8E", # Blue Jays Navy
  "Washington Nationals" = "#AB0003" # Nationals Red
)

# Two-column lookup table (Team, team_color) built from the list above
team_colors_df <- data.frame(
  Team = names(mlb_colors),
  team_color = unlist(mlb_colors)
)
# Add each team's colour to the stadium table, matching on `Team`
mlb_stadiums_table <- left_join(mlb_stadiums_table, team_colors_df, by = "Team")

# Force the name of the last column (the one the join just appended) to be
# "team_color"
names(mlb_stadiums_table)[ncol(mlb_stadiums_table)] <- "team_color"
library(rvest)
library(magick)
## Warning: package 'magick' was built under R version 4.3.3
## Linking to ImageMagick 6.9.12.98
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(dplyr)

# Scrape the ballpark panorama gallery: the caption text plus every image
# found under the page's "dm_content" element.
url <- "https://www.launchphotography.com/Ballpark_Panoramas.html"
webpage <- read_html(url)

# Caption nodes are the elements carrying both the "m-font-size-13" and the
# "font-size-16" class
stadium_names_nodes <- webpage %>%
  html_nodes(xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "m-font-size-13", " " )) and contains(concat( " ", @class, " " ), concat( " ", "font-size-16", " " ))]')
stadium_names <- stadium_names_nodes %>%
  html_text()

# Every <img> below the element whose id is "dm_content"; resolve each `src`
# against the page URL so relative paths become absolute
image_nodes <- webpage %>%
  html_nodes(xpath = '//*[(@id = "dm_content")]//img')
image_urls <- image_nodes %>%
  html_attr('src') %>%
  url_absolute(url)

# Read one image; on any error, report it and yield NULL so a failed
# download does not abort the whole run
fetch_image <- function(image_url) {
  tryCatch(
    image_read(image_url),
    error = function(e) {
      message("Error in downloading image: ", e)
      NULL
    }
  )
}

# One list element per image URL (NULL where the download failed)
image_list <- lapply(image_urls, fetch_image)
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
## 
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
## 
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
## 
## Error in downloading image: Error in curl::curl_download(url, tmp, handle = h): HTTP error 404.
# Pair each scraped caption with the image at the same position. The two
# scrapes can differ in length, so keep only the positions both of them have.
# seq_len() is used instead of 1:min_length because 1:0 is c(1, 0), which
# would produce a bogus row when either scrape comes back empty.
min_length <- min(length(stadium_names), length(image_list))
stadium_images_df <- tibble(
  Stadium = stadium_names[seq_len(min_length)],
  # I() keeps the list of image objects (and NULLs) as-is in a single column
  Image = I(image_list[seq_len(min_length)])
)

# Print every image that was downloaded; entries that are NULL (failed
# downloads) are skipped
for (position in seq_along(stadium_images_df$Image)) {
  image_object <- stadium_images_df$Image[[position]]
  if (is.null(image_object)) {
    next
  }
  print(image_object)
}
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    474 sRGB       FALSE   234605 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    614 sRGB       FALSE   318129 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    540 sRGB       FALSE   291537 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    499 sRGB       FALSE   279376 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    523 sRGB       FALSE   295955 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    499 sRGB       FALSE   272255 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    409 sRGB       FALSE   231265 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    541 sRGB       FALSE   311994 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    610 sRGB       FALSE   357814 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    584 sRGB       FALSE   291814 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    396 sRGB       FALSE   231548 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    503 sRGB       FALSE   307399 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    591 sRGB       FALSE   296912 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    581 sRGB       FALSE   324363 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    580 sRGB       FALSE   419377 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    681 sRGB       FALSE   375248 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    565 sRGB       FALSE   299645 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    484 sRGB       FALSE   307413 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    507 sRGB       FALSE   301651 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    497 sRGB       FALSE   278870 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    634 sRGB       FALSE   397795 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    613 sRGB       FALSE   362645 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    596 sRGB       FALSE   318296 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    499 sRGB       FALSE   274633 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    619 sRGB       FALSE   336081 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    615 sRGB       FALSE   350995 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    586 sRGB       FALSE   358121 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    406 sRGB       FALSE   207396 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    533 sRGB       FALSE   267994 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    488 sRGB       FALSE   241896 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    530 sRGB       FALSE   269468 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    436 sRGB       FALSE   221814 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    533 sRGB       FALSE   346027 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    572 sRGB       FALSE   314691 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    489 sRGB       FALSE   285644 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    479 sRGB       FALSE   251386 72x72
# Discard rows 31 through 36 of the caption/image table by position
# NOTE(review): presumably these are the entries beyond the 30 teams - confirm
rows_to_remove <- as.numeric(31:36)
stadium_images_df <- stadium_images_df[-rows_to_remove, ]

library(dplyr)
library(stringr)
library(magick)

# Repair the mistyped opening parenthesis in the Busch Stadium label.
# fixed = TRUE matches the text literally, so the stray ")" and the "." in
# "St." are not interpreted as regular-expression syntax.
stadium_images_df$Stadium <- gsub(
  "Busch Stadium )St. Louis Cardinals)",
  "Busch Stadium (St. Louis Cardinals)",
  stadium_images_df$Stadium,
  fixed = TRUE
)

# Reduce each Stadium label to the text between its first pair of parentheses,
# expose that text as `Team`, and keep only the Team and Image columns
stadium_images_df <- stadium_images_df %>%
  mutate(Stadium = str_extract(Stadium, "(?<=\\().+?(?=\\))")) %>%
  select(Team = Stadium, Image)

# Attach the image data to the stadium table, matching rows on Team
mlb_stadiums_table <- left_join(mlb_stadiums_table, stadium_images_df, by = "Team")

# Copy each team's image into the stadium table, one team at a time.
# A value is written only when the lookup yields exactly one row whose first
# Image entry is non-NULL; every other team is skipped.
for (i in seq_along(mlb_stadiums_table$Team)) {
  image_row <- filter(stadium_images_df, Team == mlb_stadiums_table$Team[i])

  if (nrow(image_row) != 1 || is.null(image_row$Image[[1]])) {
    next
  }

  mlb_stadiums_table$Image[i] <- image_row$Image
}
## Warning: Unknown or uninitialised column: `Image`.
# Print every stored image object in table order, skipping NULL entries
for (i in seq_along(mlb_stadiums_table$Image)) {
  image_object <- mlb_stadiums_table$Image[[i]]

  if (is.null(image_object)) {
    next
  }

  print(image_object)
}
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    484 sRGB       FALSE   307413 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    499 sRGB       FALSE   274633 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    610 sRGB       FALSE   357814 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    596 sRGB       FALSE   318296 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    540 sRGB       FALSE   291537 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    507 sRGB       FALSE   301651 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    581 sRGB       FALSE   324363 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    613 sRGB       FALSE   362645 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    499 sRGB       FALSE   279376 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    474 sRGB       FALSE   234605 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    615 sRGB       FALSE   350995 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    634 sRGB       FALSE   397795 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    619 sRGB       FALSE   336081 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    584 sRGB       FALSE   291814 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    406 sRGB       FALSE   207396 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    503 sRGB       FALSE   307399 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    580 sRGB       FALSE   419377 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    488 sRGB       FALSE   241896 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    523 sRGB       FALSE   295955 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    497 sRGB       FALSE   278870 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    499 sRGB       FALSE   272255 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    541 sRGB       FALSE   311994 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    591 sRGB       FALSE   296912 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    396 sRGB       FALSE   231548 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    565 sRGB       FALSE   299645 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    681 sRGB       FALSE   375248 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    533 sRGB       FALSE   267994 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    586 sRGB       FALSE   358121 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    409 sRGB       FALSE   231265 72x72  
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 JPEG    1600    614 sRGB       FALSE   318129 72x72

1. Investigating Home Field Advantage in Major League Baseball

  1. Method: We first compiled each team’s winning % in its current stadium by calculating its record over the years. The most recently built stadium is the Texas Rangers’ Globe Life Field (2020), in which they had already won a World Series as of last year. For teams like the Chicago Cubs and Boston Red Sox, who play in stadiums over 100 years old, I decided to begin tracking data from 1974, which is 50 years ago. I chose not to take into account the era of MLB before the racial barrier was broken, for obvious reasons.

  2. Graph1: Our first analysis is to try to find a correlation between home winning pct and another variable. While there are multiple factors that go into winning a baseball game, a team generally performs better at home than on the road. Maybe it is because the players are comfortable in the dugouts, or fist bump the janitor before gametime, or come from their family homes. We show this by producing a graph of 2023 splits of how many more wins a team gets at home than on the road. In fact, the graph shows that no team has a worse winning pct at home, in its current stadium, than on the road. 2023 is significant for scheduling because it is the first time MLB scheduled all 30 teams to play each other for at least one series a year. Prior to that, I would see my St. Louis Cardinals play the New York Yankees once every 4 years, even though these are the two franchises with the most World Series trophies, and both are at the top of annual attendance as well.

  3. Graph2: Next, we graphed the home winning % over time, and ordered the teams from highest average attendance to lowest. A correlation would show up as a sloping trend in the data, but we do not see that. The team that averages the least attendance, the Tampa Bay Rays, has just as good a home record as the team that averages the most, the Los Angeles Dodgers. The Rays still have their own home field advantage, with a unique park design, including a roof and artificial turf. It does look to be a trend that teams with lower winning percentages average less attendance.

library(dplyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(stringr)

# Home-minus-away winning percentage, computed position by position.
# NOTE(review): this assumes home_win_df and away_win_df list the teams in the
# same row order -- confirm where the two frames are built.
win_difference <- home_win_df$HomeWinPct - away_win_df$AwayWinPct

# Manual override of the Phillies' value.
# NOTE(review): the origin of 0.086 is not visible in this section -- confirm.
phillies_index <- which(home_win_df$Team == "Philadelphia Phillies")
win_difference[phillies_index] <- 0.086

# Assemble the plotting frame once, after the override has been applied
win_difference_df <- tibble(Team = home_win_df$Team, WinDifference = win_difference)

# Horizontal bar chart with teams ordered by their home/away gap
ggplot(win_difference_df, aes(x = reorder(Team, WinDifference), y = WinDifference)) +
  geom_bar(stat = "identity", fill = "skyblue", color = "black") +
  labs(
    title = "Difference in HomeWinPct and AwayWinPct by Team",
    x = "Team",
    y = "Difference (HomeWinPct - AwayWinPct)"
  ) +
  theme_minimal() +
  coord_flip()

########## home win pct vs attendance
# Pair every team's home record with its attendance row; teams without an
# attendance match are kept (all.x = TRUE)
merged_df <- merge(
  home_win_df, attendance_df,
  by.x = "Team", by.y = "franchise",
  all.x = TRUE
)

# Sort from highest to lowest average attendance
merged_df <- merged_df[order(merged_df$average_attendance, decreasing = TRUE), ]

# Bring in each team's colour for the bar fill
merged_df <- left_join(
  merged_df,
  select(mlb_stadiums_table, Team, team_color),
  by = "Team"
)

# Bars are filled with the literal colour values stored in team_color
library(ggplot2)
ggplot(merged_df, aes(x = reorder(Team, -average_attendance), y = HomeWinPct, fill = team_color)) +
  geom_bar(stat = "identity") +
  scale_fill_identity() +
  labs(
    title = "Home Win Percentage vs. Team by Average Attendance",
    x = "Team (Ordered by Average Attendance)",
    y = "Home Win Percentage"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2. Investigating what is a “hitters” vs “pitchers” park

  1. Method: According to MLB, 47.3% of runs batted in (RBIs) in 2019 were produced via a home run. With rising strikeout rates, it is evident that a hitter’s swing-for-the-fences approach has become more common as of late. I would predict this is due to rising velocity by pitchers across the board. Man, the more I talk, the more I wish I had a plot for it. Our goal is to classify which are the most “hitter” friendly parks and the most “pitcher” friendly parks.

  2. Graph1: Plotting Home Runs vs. Doubles.

    1. With a smaller baseball field, one could expect a home run to be more likely than a big baseball field. But if the field, and hence, the outfield is smaller, then we would project a double to be less likely. Doubles usually happen when a hitter finds a gap in the outfield, which we would potentially see in bigger stadiums.
    2. Our Graph confirms that a lot of the top hitting teams for doubles have low Home Run totals. Yet, it is not a perfect correlation, likely meaning there are other factors at play.
  3. Plotting Home Runs vs. ParkFactor… A known X factor of hitting home runs is weather. A baseball will carry further in the air with high humidity, altitude, and temperature. These are all modifiers that make up the density of air. Note that high humidity actually means less dense air. Conversely, cold air will dampen a ball’s flight by being more dense. It is estimated that every 10 degrees above 75 degrees Fahrenheit will carry a baseball 3 feet further.

    1. Based on the conditions we discussed, it shouldn’t be surprising that the mile high city and the desert lead the league in extra carry. Also, the fact that cities like New York and Boston, which can be frigid at the start and end of a baseball season, are at the end of the list.
  4. Plotting Home Runs vs. Distance to Center Field.

    1. One would expect the order of the x-axis to be pretty similar to the list from before, to give an even playing field, and that is what we get. The Colorado Rockies have the biggest Park Factor and also the biggest ballpark. The Red Sox have the 2nd lowest ParkFactor and the smallest stadium size.
  5. Plotting Home - Away ERA splits per team (negative is bad).

    1. Once again, the extremes of the data are telling us valuable information. The Rockies’, Rangers’, Red Sox’, and Reds’ pitchers have much better success on the road than in their own stadiums, making these strong hitters’ ballparks. Conversely, the Giants, Padres, and Diamondbacks have great home ERAs vs. the road, making these parks more pitcher friendly.
  6. Finally, we give a visual of each ballpark and some of its attributes, calculated based on the work we have established.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 613 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 581 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 584 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 497 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 591 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 503 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 540 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 614 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 596 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 615 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 406 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 533 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 681 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 634 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 580 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 507 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 565 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 586 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 484 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 610 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 619 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 488 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 396 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 409 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 541 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 499 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 499 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 499 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 523 sRGB TRUE 0 72x72
# A tibble: 1 × 7 format width height colorspace matte filesize density
1 JPEG 1600 474 sRGB TRUE 0 72x72
Coors Field

Comerica Park
Comerica Park
Kauffman Stadium
Kauffman Stadium
Oriole Park at Camden Yards
Oriole Park at Camden Yards
Progressive Field
Progressive Field
Minute Maid Park
Minute Maid Park
Citi Field
Citi Field
Yankee Stadium
Yankee Stadium
Chase Field
Chase Field
Globe Life Field
Globe Life Field
loanDepot Park
loanDepot Park
Tropicana Field
Tropicana Field
Target Field
Target Field
Great American Ball Park
Great American Ball Park
Nationals Park
Nationals Park
Citizens Bank Park
Citizens Bank Park
T-Mobile Park
T-Mobile Park
Truist Park
Truist Park
American Family Field
American Family Field
Busch Stadium
Busch Stadium
Guaranteed Rate Field
Guaranteed Rate Field
Oakland Coliseum
Oakland Coliseum
Rogers Centre
Rogers Centre
Wrigley Field
Wrigley Field
PNC Park
PNC Park
Angel Stadium
Angel Stadium
Petco Park
Petco Park
Dodger Stadium
Dodger Stadium
Oracle Park
Oracle Park
Fenway Park
Fenway Park

3. Sign Stealing…

  1. What is the best way to Win a Trophy? Just ask Michigan, Tom Brady, or the Houston Astros Organization… Gain an advantage over your opponent. In Sports, “signing” is akin to sign language, where a coach uses nonverbal cues/gestures to relay information to their players about what strategy to employ on a given play. By the time the players completely understand this code, it is changed so that an opponent can’t study film to figure it out. Michigan Football won a National Championship with storylines of “sign stealing” dominating the season. The 2017 Houston Astros were found to have stolen signs a couple of years after their championship, when a former player began whistleblowing. They supposedly interpreted the video feed from the center field camera to figure out a team’s next move during the game: this could include what pitch would be thrown, whether a player is stealing, and more. For my language analysis section of the project, I will not engage in “sign stealing”, but I will do my analysis on how this hidden language affected the Houston Astros in 2017, and whether the numbers show a clear advantage. One would think that their “home” numbers would be better than “away” because bench players would bang on trash cans to signal that an offspeed pitch was coming.

  2. We see that the 2017 Astros did not have an exceptional home record compared to other teams in their championship years.

  3. Among the Astros’ 4 best hitters in 2017 (including MVP Jose Altuve), only Carlos Correa had a higher Home avg. than an Away Avg. for hitting (still averaged .300 on the road, impressively). Neither the team record nor home vs. away splits have stood out to me, and I cannot statistically see how significant their cheating was. Perhaps this advantage was more momentary than long-standing.

  4. However, the MLB community noticed something peculiar in Game 6 of the 2019 ALCS (meaning, the winner advanced to the Championship). Batter Jose Altuve was up against flamethrower Aroldis Chapman, who is notorious for reaching record-breaking 103 MPH on any given night. In the bottom of the ninth, with a tie game and two outs, Jose Altuve turns on the first pitch which clocks in at 84 mph. This ball is blasted out of the park and when Altuve comes home to his teammates who are crowded around home plate, he clenches his jersey hard and yells at them not to rip it off. The imprint of a wire in his collar is exposed as he is rounding third. Fans, players, and analysts alike are led to believe that Altuve had a buzzer in his jersey which signaled the pitch would be offspeed. While the GM and Coach were fired from the team, the players were not punished, and many moved on to big contracts on different teams. In 2022, the Houston Astros won the World Series again.

library(dplyr)
library(ggplot2)
library(stringr)

# World Series winners used in the home-record comparison, written as
# year = team pairs (most recent first) and then unpacked into two columns
champion_by_year <- c(
  "2023" = "Texas Rangers",
  "2022" = "Houston Astros",
  "2019" = "Washington Nationals",
  "2018" = "Boston Red Sox",
  "2017" = "Houston Astros",
  "2016" = "Chicago Cubs",
  "2015" = "Kansas City Royals",
  "2014" = "San Francisco Giants",
  "2013" = "Boston Red Sox",
  "2012" = "San Francisco Giants",
  "2011" = "St. Louis Cardinals"
)

champions <- data.frame(
  Year = as.numeric(names(champion_by_year)),
  Team = unname(champion_by_year)
)

# Extract the number of wins from a "W-L" home record string.
#
# home_record: a record such as "52-29". When more than one value is supplied,
#   only the first is used.
# Returns the text before the first "-" converted to a number, or 0 when the
#   input is empty, NA, or does not start with a number.
extract_home_wins <- function(home_record) {
  # Nothing to parse: fall back to the same default as an unreadable record
  if (length(home_record) == 0L) {
    return(0)
  }

  first_record <- as.character(home_record[[1]])
  wins_text <- strsplit(first_record, "-", fixed = TRUE)[[1]][1]

  # A malformed record is an expected case handled just below, so the
  # "NAs introduced by coercion" warning would only be noise here
  wins <- suppressWarnings(as.numeric(wins_text))

  if (is.na(wins)) {
    0
  } else {
    wins
  }
}

# Look up each champion's home win total in the standings for its title year.
# Entries that cannot be resolved keep the initial value of 0 and a message is
# printed explaining why.
home_wins <- numeric(nrow(champions))
for (i in seq_len(nrow(champions))) {
  team_name <- champions$Team[i]
  championship_year <- champions$Year[i]

  # `[[row, col]]` needs exactly one row; report and skip the year instead of
  # aborting the whole script when it is missing or duplicated
  year_row <- which(all_standings$Year == championship_year)
  if (length(year_row) != 1) {
    print(paste("Expected exactly one standings entry for year", championship_year,
                "but found", length(year_row)))
    next
  }
  year_standings <- all_standings[[year_row, "Standings"]]

  if (is.data.frame(year_standings[[1]])) {
    standings_df <- year_standings[[1]]

    # HOME holds the home record text for the row whose TEAM matches
    team_home_record <- standings_df$HOME[standings_df$TEAM == team_name]

    if (length(team_home_record) > 0) {
      home_wins[i] <- extract_home_wins(team_home_record)
    } else {
      print(paste("No home record found for team", team_name, "in year", championship_year))
    }
  } else {
    print(paste("Standings data for year", championship_year, "is not structured as expected."))
  }
}


# Add the home win totals, each team's colour, and a "<Year> <Team>" label
# that serves as a unique key per championship season
champions_with_wins <- champions %>%
  mutate(HomeWins = home_wins) %>%
  inner_join(mlb_stadiums_table[c("Team", "team_color")], by = "Team") %>%
  mutate(TeamYear = paste(Year, Team))

# Most recent championship first
ordered_champions <- arrange(champions_with_wins, desc(Year))

# Horizontal bars of home wins per championship season, each bar filled with
# the colour mapped to its TeamYear label
ggplot(ordered_champions, aes(x = reorder(TeamYear, -HomeWins), y = HomeWins, fill = TeamYear)) +
  geom_bar(stat = "identity") +
  # Named colour vector: TeamYear label -> team colour
  scale_fill_manual(values = setNames(ordered_champions$team_color, ordered_champions$TeamYear)) +
  # Breaks every 5 wins on the HomeWins scale
  scale_y_continuous(breaks = seq(0, max(ordered_champions$HomeWins, na.rm = TRUE) + 5, by = 5)) +
  coord_flip() +
  labs(
    x = "Team (Year)",
    y = "Home Wins",
    title = "Home Wins in Championship Season for MLB Teams"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

##########plotting individual hitter stats

library(ggplot2)
# Build a bar chart comparing one hitter's batting average by location.
#
# player_splits: data frame with a BREAKDOWN column (used for the x axis and
#   the fill) and an AVG column (bar height).
# player_name: text placed at the front of the plot title.
# Returns the ggplot object; called at top level it is auto-printed, just like
# the inline plots this helper replaces.
plot_home_away_avg <- function(player_splits, player_name) {
  ggplot(player_splits, aes(x = BREAKDOWN, y = AVG, fill = BREAKDOWN)) +
    geom_bar(stat = "identity", position = "dodge") +
    labs(title = paste0(player_name, ": Home vs. Away AVG"),
         x = "Location",
         y = "AVG",
         fill = "Location") +
    theme_minimal()
}

# Jose Altuve
plot_home_away_avg(JoseAltuve, "Jose Altuve")

# Alex Bregman
plot_home_away_avg(AlexBregman, "Alex Bregman")

# George Springer
plot_home_away_avg(GeorgeSpringer, "George Springer")

# Carlos Correa
plot_home_away_avg(CarlosCorrea, "Carlos Correa")